In [1]:
# Third-party dependencies: numpy/pandas for the data handling,
# matplotlib for the charts, and scikit-learn's SimpleImputer for
# filling missing values before aggregation.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
<frozen importlib._bootstrap>:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
In [2]:
# Location of the raw case-count CSV. Kept in one named constant so the
# notebook can be pointed at another copy of the file without touching the
# loading code.
# NOTE(review): this is an absolute path on a local drive; anyone else
# running the notebook has to change it.
DATA_PATH = 'E:/covid_19_data.csv'

# Raw records exactly as stored in the file; the cleaning cells below
# reshape this frame.
df = pd.read_csv(DATA_PATH)
In [3]:
# Preview the first 50 rows of the freshly loaded frame to check the
# column names and value formats before cleaning.
df.head(50)
Out[3]:
SNo ObservationDate Province/State Country/Region Last Update Confirmed Deaths Recovered
0 1 01/22/2020 Anhui Mainland China 1/22/2020 17:00 1.0 0.0 0.0
1 2 01/22/2020 Beijing Mainland China 1/22/2020 17:00 14.0 0.0 0.0
2 3 01/22/2020 Chongqing Mainland China 1/22/2020 17:00 6.0 0.0 0.0
3 4 01/22/2020 Fujian Mainland China 1/22/2020 17:00 1.0 0.0 0.0
4 5 01/22/2020 Gansu Mainland China 1/22/2020 17:00 0.0 0.0 0.0
5 6 01/22/2020 Guangdong Mainland China 1/22/2020 17:00 26.0 0.0 0.0
6 7 01/22/2020 Guangxi Mainland China 1/22/2020 17:00 2.0 0.0 0.0
7 8 01/22/2020 Guizhou Mainland China 1/22/2020 17:00 1.0 0.0 0.0
8 9 01/22/2020 Hainan Mainland China 1/22/2020 17:00 4.0 0.0 0.0
9 10 01/22/2020 Hebei Mainland China 1/22/2020 17:00 1.0 0.0 0.0
10 11 01/22/2020 Heilongjiang Mainland China 1/22/2020 17:00 0.0 0.0 0.0
11 12 01/22/2020 Henan Mainland China 1/22/2020 17:00 5.0 0.0 0.0
12 13 01/22/2020 Hong Kong Hong Kong 1/22/2020 17:00 0.0 0.0 0.0
13 14 01/22/2020 Hubei Mainland China 1/22/2020 17:00 444.0 17.0 28.0
14 15 01/22/2020 Hunan Mainland China 1/22/2020 17:00 4.0 0.0 0.0
15 16 01/22/2020 Inner Mongolia Mainland China 1/22/2020 17:00 0.0 0.0 0.0
16 17 01/22/2020 Jiangsu Mainland China 1/22/2020 17:00 1.0 0.0 0.0
17 18 01/22/2020 Jiangxi Mainland China 1/22/2020 17:00 2.0 0.0 0.0
18 19 01/22/2020 Jilin Mainland China 1/22/2020 17:00 0.0 0.0 0.0
19 20 01/22/2020 Liaoning Mainland China 1/22/2020 17:00 2.0 0.0 0.0
20 21 01/22/2020 Macau Macau 1/22/2020 17:00 1.0 0.0 0.0
21 22 01/22/2020 Ningxia Mainland China 1/22/2020 17:00 1.0 0.0 0.0
22 23 01/22/2020 Qinghai Mainland China 1/22/2020 17:00 0.0 0.0 0.0
23 24 01/22/2020 Shaanxi Mainland China 1/22/2020 17:00 0.0 0.0 0.0
24 25 01/22/2020 Shandong Mainland China 1/22/2020 17:00 2.0 0.0 0.0
25 26 01/22/2020 Shanghai Mainland China 1/22/2020 17:00 9.0 0.0 0.0
26 27 01/22/2020 Shanxi Mainland China 1/22/2020 17:00 1.0 0.0 0.0
27 28 01/22/2020 Sichuan Mainland China 1/22/2020 17:00 5.0 0.0 0.0
28 29 01/22/2020 Taiwan Taiwan 1/22/2020 17:00 1.0 0.0 0.0
29 30 01/22/2020 Tianjin Mainland China 1/22/2020 17:00 4.0 0.0 0.0
30 31 01/22/2020 Tibet Mainland China 1/22/2020 17:00 0.0 0.0 0.0
31 32 01/22/2020 Washington US 1/22/2020 17:00 1.0 0.0 0.0
32 33 01/22/2020 Xinjiang Mainland China 1/22/2020 17:00 0.0 0.0 0.0
33 34 01/22/2020 Yunnan Mainland China 1/22/2020 17:00 1.0 0.0 0.0
34 35 01/22/2020 Zhejiang Mainland China 1/22/2020 17:00 10.0 0.0 0.0
35 36 01/22/2020 NaN Japan 1/22/2020 17:00 2.0 0.0 0.0
36 37 01/22/2020 NaN Thailand 1/22/2020 17:00 2.0 0.0 0.0
37 38 01/22/2020 NaN South Korea 1/22/2020 17:00 1.0 0.0 0.0
38 39 01/23/2020 Anhui Mainland China 1/23/20 17:00 9.0 0.0 0.0
39 40 01/23/2020 Beijing Mainland China 1/23/20 17:00 22.0 0.0 0.0
40 41 01/23/2020 Chongqing Mainland China 1/23/20 17:00 9.0 0.0 0.0
41 42 01/23/2020 Fujian Mainland China 1/23/20 17:00 5.0 0.0 0.0
42 43 01/23/2020 Gansu Mainland China 1/23/20 17:00 2.0 0.0 0.0
43 44 01/23/2020 Guangdong Mainland China 1/23/20 17:00 32.0 0.0 2.0
44 45 01/23/2020 Guangxi Mainland China 1/23/20 17:00 5.0 0.0 0.0
45 46 01/23/2020 Guizhou Mainland China 1/23/20 17:00 3.0 0.0 0.0
46 47 01/23/2020 Hainan Mainland China 1/23/20 17:00 5.0 0.0 0.0
47 48 01/23/2020 Hebei Mainland China 1/23/20 17:00 1.0 1.0 0.0
48 49 01/23/2020 Heilongjiang Mainland China 1/23/20 17:00 2.0 0.0 0.0
49 50 01/23/2020 Henan Mainland China 1/23/20 17:00 5.0 0.0 0.0
In [ ]:
# Remove the row counter and the feed timestamp; the cells below only use
# the date, location and count columns.
# The CSV header spells the timestamp column 'Last Update' (with a space),
# so that exact label is required -- any other spelling raises KeyError.
df.drop(columns=['SNo', 'Last Update'], inplace=True)
In [ ]:
 
In [15]:
# Shorten the verbose source headers to the names used by every later cell.
df.rename(
    columns={
        'ObservationDate': 'Date',
        'Province/State': 'State',
        'Country/Region': 'Country',
    },
    inplace=True,
)
In [16]:
# Preview the first 10 rows to confirm the shortened column names.
df.head(10)
Out[16]:
Date State Country Confirmed Deaths Recovered
0 01/22/2020 Anhui Mainland China 1.0 0.0 0.0
1 01/22/2020 Beijing Mainland China 14.0 0.0 0.0
2 01/22/2020 Chongqing Mainland China 6.0 0.0 0.0
3 01/22/2020 Fujian Mainland China 1.0 0.0 0.0
4 01/22/2020 Gansu Mainland China 0.0 0.0 0.0
5 01/22/2020 Guangdong Mainland China 26.0 0.0 0.0
6 01/22/2020 Guangxi Mainland China 2.0 0.0 0.0
7 01/22/2020 Guizhou Mainland China 1.0 0.0 0.0
8 01/22/2020 Hainan Mainland China 4.0 0.0 0.0
9 01/22/2020 Hebei Mainland China 1.0 0.0 0.0
In [17]:
# Replace the date strings in 'Date' with parsed pandas datetime values.
df['Date'] = df['Date'].pipe(pd.to_datetime)
In [18]:
# Preview the first 10 rows to confirm 'Date' now holds parsed dates.
df.head(10)
Out[18]:
Date State Country Confirmed Deaths Recovered
0 2020-01-22 Anhui Mainland China 1.0 0.0 0.0
1 2020-01-22 Beijing Mainland China 14.0 0.0 0.0
2 2020-01-22 Chongqing Mainland China 6.0 0.0 0.0
3 2020-01-22 Fujian Mainland China 1.0 0.0 0.0
4 2020-01-22 Gansu Mainland China 0.0 0.0 0.0
5 2020-01-22 Guangdong Mainland China 26.0 0.0 0.0
6 2020-01-22 Guangxi Mainland China 2.0 0.0 0.0
7 2020-01-22 Guizhou Mainland China 1.0 0.0 0.0
8 2020-01-22 Hainan Mainland China 4.0 0.0 0.0
9 2020-01-22 Hebei Mainland China 1.0 0.0 0.0
In [21]:
# Fill every missing cell with the imputer's constant placeholder so that
# rows lacking a value (e.g. a NaN 'State') survive the grouping below.
imputer = SimpleImputer(strategy='constant')

# fit_transform() hands back one plain array for the whole mixed-type frame,
# so a DataFrame built directly from it has object dtype in every column.
# infer_objects() restores proper numeric / datetime dtypes wherever a
# column's values allow it, keeping the later sums numeric.
df2 = pd.DataFrame(imputer.fit_transform(df), columns=df.columns).infer_objects()
In [23]:
# Collapse the per-state rows into one row per (Country, Date) by summing
# the three counters.
# Only the value columns are selected: the grouping keys must not be listed
# in the selection as well, otherwise pandas tries to aggregate them too and
# reset_index() cannot re-insert columns that already exist.
# NOTE(review): country labels are used as-is; the sorted preview of this
# frame suggests some are not normalised (e.g. "('St. Martin',)") -- confirm
# whether they should be cleaned before grouping.
df3 = (
    df2.groupby(['Country', 'Date'])[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)
In [24]:
# Preview the first 10 rows of the per-country, per-date totals.
df3.head(10)
Out[24]:
Country Date Confirmed Deaths Recovered
0 Azerbaijan 2020-02-28 1.0 0.0 0.0
1 ('St. Martin',) 2020-03-10 2.0 0.0 0.0
2 Afghanistan 2020-02-24 1.0 0.0 0.0
3 Afghanistan 2020-02-25 1.0 0.0 0.0
4 Afghanistan 2020-02-26 1.0 0.0 0.0
5 Afghanistan 2020-02-27 1.0 0.0 0.0
6 Afghanistan 2020-02-28 1.0 0.0 0.0
7 Afghanistan 2020-02-29 1.0 0.0 0.0
8 Afghanistan 2020-03-01 1.0 0.0 0.0
9 Afghanistan 2020-03-02 1.0 0.0 0.0
In [26]:
# Distinct country labels found in the aggregated frame; the plotting loop
# below draws one chart per entry.
Countries = df3.loc[:, 'Country'].unique()

# Number of distinct labels (shown as the cell output).
len(Countries)
Out[26]:
223
In [31]:
# One scatter chart per country with its confirmed / recovered / death totals.
for country in Countries:
    # Rows belonging to this country only.
    country_rows = df3[df3['Country'] == country].reset_index()

    # The x axis is the row position within the country's own records,
    # not a calendar date.
    day_numbers = np.arange(0, len(country_rows))

    # Column name doubles as the legend label.
    for column, colour in (('Confirmed', 'blue'), ('Recovered', 'green'), ('Deaths', 'red')):
        plt.scatter(day_numbers, country_rows[column], color=colour, label=column)

    plt.title(country)
    plt.xlabel('Days since the first case')
    plt.ylabel('Number of cases')
    plt.legend()
    plt.show()
In [32]:
# Worldwide totals: sum the per-country counters for each date.
# The grouping key 'Date' is deliberately left out of the column selection;
# listing it there makes pandas try to aggregate the key itself and then
# clash with it when reset_index() restores it as a column.
df4 = (
    df3.groupby(['Date'])[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)
In [33]:
# Single scatter chart of the worldwide totals, one point per date row.
world_totals = df4

# The x axis is the row position in the daily totals frame.
day_numbers = np.arange(0, len(world_totals))

# Column name doubles as the legend label.
for column, colour in (('Confirmed', 'blue'), ('Recovered', 'green'), ('Deaths', 'red')):
    plt.scatter(day_numbers, world_totals[column], color=colour, label=column)

plt.title('World')
plt.xlabel('Days since the first case')
plt.ylabel('Number of Cases')
plt.legend()
plt.show()
In [34]:
# Show again how many distinct country labels were plotted.
len(Countries)
Out[34]:
223
In [ ]: